# Software developer salaries: linear regression with a categorical predictor encoded as dummy (indicator) variables.
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
import pandas as pd

# Fit an ordinary-least-squares model with a categorical predictor encoded
# as dummy (indicator) variables, then report the in-sample RMSE and the
# full regression summary.
#
# Columns of the tab-separated file used below:
#   1    -> response variable
#   2, 3 -> predictors entered as-is
#   4    -> education level code (presumably 1/2/3 -- TODO confirm against
#           the data file)
data = np.genfromtxt('data3.txt', delimiter="\t")
y = data[:, [1]]
x12 = data[:, [2, 3]]
edu = data[:, [4]]

# One indicator column per education level EXCEPT the highest one, which
# serves as the baseline category. Including an indicator for every level
# together with the intercept makes the indicator columns sum to the
# constant column (the "dummy variable trap"): the design matrix becomes
# rank-deficient and the individual coefficients are no longer identified.
levels = np.unique(edu[~np.isnan(edu)])
# Broadcasting (n, 1) against (k - 1,) yields an (n, k - 1) indicator matrix;
# it degrades gracefully to zero columns when only one level is present.
edu_dummies = (edu == levels[:-1]).astype(float)

x = np.hstack((x12, edu_dummies))
x = sm.add_constant(x)  # prepend the intercept column
model = sm.OLS(y, x).fit()

# In-sample root-mean-squared error of the fitted values.
pred = model.fittedvalues
rmse = np.sqrt(mean_squared_error(y, pred))
print(rmse)

print(model.summary())

